Scope of the session

Survey solution and R to implement HSMF creation

Data from the servers and how to manipulate

rm(list = ls())
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.4
library(stringr)
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.0.3
library(rgdal)
## Warning: package 'rgdal' was built under R version 4.0.4
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.0.3
## rgdal: version: 1.5-23, (SVN revision 1121)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/KELPTMGD1561/Documents/R/win-library/4.0/rgdal/gdal
## GDAL binary built with GEOS: TRUE 
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/KELPTMGD1561/Documents/R/win-library/4.0/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-5
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
## Overwritten PROJ_LIB was C:/Users/KELPTMGD1561/Documents/R/win-library/4.0/rgdal/proj
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.4
library(htmlwidgets)
## Warning: package 'htmlwidgets' was built under R version 4.0.3
library(readstata13)  #for reading the stata files
library(dplyr)  #data manipulation
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)##foreign
library(htmlTable)##html tables
## Warning: package 'htmlTable' was built under R version 4.0.3
library(magrittr)  #manipulate
## Warning: package 'magrittr' was built under R version 4.0.3
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(loose.rock)  #for proper changing of the string cases
## Warning: package 'loose.rock' was built under R version 4.0.5
library(leaflet)
library(sf)
## Warning: package 'sf' was built under R version 4.0.3
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1

Next steps

Read the data form the .TAB

using the DPLYR package do the following t steps

Select the required names of the variables

Rename the variables using names that make sense

We rename using the Rename function provided by the dplyr package.

NB the new name comes before the old name. Vice versa won’t work

Cluster Data processing

nassepvUpdate <- read.delim("Rladies_Raw_data/rawDataServer/NassepSixUpdate11.tab")
#head(NassepSixUpdate11)



nass_0<-nassepvUpdate %>% select(clu00,clu01,clu02,clu03,clu04,clu05,clu06,clu07,clu08,
                                 clu09,clu10,gps_clus__Latitude,gps_clus__Longitude,
                                 gps_clus__Accuracy,date_int,v1,v2, qw,hs1,perhs1,
                                 hs2,pp_1, perhs2,hs3,perhs3, c0_0,c1_1,c2_2,
                                 c3_3,n1,
                                 c1a,c1b,cc2,dist_ch, cc3,time_ch,c4a,
                                 c4b,dist_tow,c5a,c5b__1,c5b__2,c5b__3,
                                 c5b__4,c5b__5,c5b__6, c5b__7, c6,c7,c8,
                                 c9,c10, c11a,per1,c11b,per2,
                                 c11c,per3,c12,c13,c13b,chief,chieftelphone,
                                 asschief,asschieftelphone,villageelder,
                                 villageeldertelephone,date_end)

### **Rename to make sense**


nass1<-nass_0 %>% rename(ClusterNumber=clu00,County=clu01,Subcounty=clu02,Division=clu03,
      Location=clu04,SubLocation=clu05,GEOCODE=clu06,EAName_1=clu07,EAtype_Residence=clu08,
      EAStatus=clu09,frame_component=clu10,clu_gps_lat=gps_clus__Latitude,clu_gps_long=gps_clus__Longitude,
      clu_gps_accuracy=gps_clus__Accuracy,date_start_interview=date_int,
      Total_Count_homesteads=v1,Total_households_Homesteads=v2,
      cluster_req_seg=qw,households_Seg_1=hs1,Percent_seg_1=perhs1,
      households_Seg_2=hs2,Pop_total_minus_pop_1=pp_1,
      Percent_seg_2=perhs2,hh_Segmet3=hs3,Percent_seg_3=perhs3,
      CumPercenths_0=c0_0,CumPerc_1=c1_1,CumPerc_2=c2_2,
      CumPer_3=c3_3,Last_2_dig_cluster=n1,
      Cluster_Listed=c1a,Why_not_listed=c1b,distance_County_HQ_to_cluster=cc2,dist_Km_Metres=dist_ch,
      Time_County_HQ_to_cluster=cc3,time_hours_minutes=time_ch,appr_town_spend_night=c4a,
      Dist_townorfacility_cluster=c4b,distance_in_KM_Metres=dist_tow,clu_req_security=c5a,
      causes_insecurity_1=c5b__1,causes_insecurity_2=c5b__2,causes_insecurity_3=c5b__3,
      causes_insecurity_4=c5b__4,causes_insecurity_5=c5b__5,causes_insecurity_6=c5b__6,
      causes_insecurity_7=c5b__7, mode_transport=c6,terrain_cluster=c7,size_cluster=c8,
      households_within_cluster=c9,current_settlement_cluster=c10,
      native_languages_1=c11a,Per_1_lang=per1,native_languages_2=c11b,Per_2_lang=per2,
      native_languages_3=c11c,Per_3_lang=per3,
      main_econ_act=c12,EAName=c13,name_EA_asknown=c13b,name_chief=chief,telephone_chief=chieftelphone,
      name_Assist_Chief=asschief,telephone_Ast_chief=asschieftelphone,Village_Elder=villageelder,
      telephone_village_elder=villageeldertelephone,Date_End=date_end)

head(nass1)
##   ClusterNumber  County Subcounty Division Location SubLocation    GEOCODE
## 1        999999 nairobi   ##N/A##  ##N/A##  ##N/A##     ##N/A## -999999999
## 2        999998  kiambu   ##N/A##  ##N/A##  ##N/A##     ##N/A## -999999999
##   EAName_1 EAtype_Residence   EAStatus frame_component clu_gps_lat clu_gps_long
## 1   herufi                2 -999999999      -999999999   -1.303507     36.81034
## 2  mucatha                1 -999999999      -999999999   -1.303354     36.81041
##   clu_gps_accuracy date_start_interview Total_Count_homesteads
## 1            4.900  2021-04-21T14:04:49                      2
## 2           16.413  2021-04-22T09:39:59                      2
##   Total_households_Homesteads cluster_req_seg households_Seg_1 Percent_seg_1
## 1                         300               1              100            33
## 2                         200               1              101            50
##   households_Seg_2 Pop_total_minus_pop_1 Percent_seg_2 hh_Segmet3 Percent_seg_3
## 1              120                   200            40         80            27
## 2               99                    99            50         NA    -999999999
##   CumPercenths_0 CumPerc_1 CumPerc_2   CumPer_3 Last_2_dig_cluster
## 1              0        33        73        100                 99
## 2              0        50       100 -999999999                 98
##   Cluster_Listed Why_not_listed distance_County_HQ_to_cluster dist_Km_Metres
## 1     -999999999             NA                    -999999999             NA
## 2     -999999999             NA                    -999999999             NA
##   Time_County_HQ_to_cluster time_hours_minutes appr_town_spend_night
## 1                -999999999                 NA               ##N/A##
## 2                -999999999                 NA               ##N/A##
##   Dist_townorfacility_cluster distance_in_KM_Metres clu_req_security
## 1                  -999999999                    NA       -999999999
## 2                  -999999999                    NA       -999999999
##   causes_insecurity_1 causes_insecurity_2 causes_insecurity_3
## 1                  NA                  NA                  NA
## 2                  NA                  NA                  NA
##   causes_insecurity_4 causes_insecurity_5 causes_insecurity_6
## 1                  NA                  NA                  NA
## 2                  NA                  NA                  NA
##   causes_insecurity_7 mode_transport terrain_cluster size_cluster
## 1                  NA     -999999999      -999999999   -999999999
## 2                  NA     -999999999      -999999999   -999999999
##   households_within_cluster current_settlement_cluster native_languages_1
## 1                -999999999                 -999999999            ##N/A##
## 2                -999999999                 -999999999            ##N/A##
##   Per_1_lang native_languages_2 Per_2_lang native_languages_3 Per_3_lang
## 1 -999999999                 NA         NA                 NA         NA
## 2 -999999999                 NA         NA                 NA         NA
##   main_econ_act     EAName name_EA_asknown name_chief telephone_chief
## 1       ##N/A## -999999999              NA    ##N/A##         ##N/A##
## 2       ##N/A## -999999999              NA    ##N/A##         ##N/A##
##   name_Assist_Chief telephone_Ast_chief Village_Elder telephone_village_elder
## 1           ##N/A##             ##N/A##       ##N/A##                 ##N/A##
## 2           ##N/A##             ##N/A##       ##N/A##                 ##N/A##
##              Date_End
## 1 2021-04-21T14:16:12
## 2 2021-04-22T09:44:52

Rewrite for use

We rewrite for easy use. Use .tab my favorite due to space and can be used by different software

write.table(nass1, file="Rladies_Final_data/samplers/cluster.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')

Structure Data processing

###structures
structure<- read.delim("Rladies_Raw_data/rawDataServer/structure.tab")
#head(structure)
nassepStructure<-merge(nassepvUpdate,structure, by="interview__id")
#head(nassepStructure)
clust2<-nassepStructure %>% select(clu00,structure__id,s01,
                        s03_gps__Latitude, s03_gps__Longitude, s03_gps__Accuracy ,
                        s03_gps__Altitude,  s03_gps__Timestamp, s03_gps_1,
                        s04, s05, s06,s06b)  
#head(clust2)  ##clean structure
clust3<-clust2 %>% rename(Cluster_number=clu00,structure_number=structure__id,name_structure=s01,
                          gps_str_latitude=s03_gps__Latitude, gps_str_longitude=s03_gps__Longitude,
                          gps_str_Accuracy=s03_gps__Accuracy,gps_str_Altitude=s03_gps__Altitude,
                          gps_str_Time=s03_gps__Timestamp,PointGPS=s03_gps_1,
                          comments_str=s04,str_residential=s05, comment_purpose=s06,
                          Feauture_type=s06b)

head(clust3)
##   Cluster_number structure_number    name_structure gps_str_latitude
## 1         999999                1 mwenda apartments        -1.303349
## 2         999999                2               ak1        -1.303336
## 3         999998                1                a1        -1.303351
## 4         999998                2                a5        -1.303348
##   gps_str_longitude gps_str_Accuracy gps_str_Altitude        gps_str_Time
## 1          36.81041           12.108           1739.3 2021-04-21T11:08:18
## 2          36.81040           13.511           1733.0 2021-04-21T11:13:58
## 3          36.81041           11.939           1733.7 2021-04-22T06:42:04
## 4          36.81041           11.908           1733.7 2021-04-22T06:44:02
##                             PointGPS comments_str str_residential
## 1 40.1300658751093,-3.21277709768003      ##N/A##               1
## 2 40.1327927923755,-3.21231883289518      ##N/A##               1
## 3 40.1322258095776,-3.21372058335391      ##N/A##               1
## 4 40.1309568480774,-3.21452928466577      ##N/A##               1
##   comment_purpose Feauture_type
## 1              NA            NA
## 2              NA            NA
## 3              NA            NA
## 4              NA            NA
write.table(clust3, file="Rladies_Final_data/samplers/structure.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')

Household data processing

household <- read.delim("Rladies_Raw_data/rawDataServer/household.tab")
nassepHouseHold<-merge(nassepvUpdate,household, by="interview__id")

cleanHouseUnsort<-subset(nassepHouseHold, select = c( clu00,clusterpart__id,structure__id,household__id,
                                                      h07,h08,h09,h10,h12,h13,h14,h15,h16,h17,h18,h19,
                                                      h20,h21,h23,h24,h24b,h25,h26b,h28))

house2<-cleanHouseUnsort %>%
  mutate(str_add=case_when(
    clusterpart__id==1 ~0,
    clusterpart__id==2 ~40,
    clusterpart__id==3 ~80,
    clusterpart__id==4 ~120,
    clusterpart__id==5 ~160,
    clusterpart__id==6 ~200
  )) %>%
  group_by(clu00) %>%
  mutate(s02=structure__id+str_add)
housfin<-subset(house2, select = c( clu00,s02,h07,h08,h09,h10,h12,h13,h14,h15,h16,h17,h18,h19,
                                    h20,h21,h23,h24,h24b,h25,h26b,h28 ))
#View(housfin)  ##clean household

hous3<-housfin %>% rename(cluster_number=clu00,Structure_number=s02,serial_num_HU=h07,
                          House_number=h08,HU_occupied=h09,reason_unoccupied=h10,
                          Name_HH_Head=h12,sex_head=h13,occupation=h14,Total_people=h15,
                          below_4_male=h16,below_4_female=h17,Five_seventeen_male=h18,
                          Five_seventeen_female=h19,eighteen_Above_male=h20,
                          eighteen_Above_female=h21,operate_business=h23,
                          Telephone_head=h24,Telephone_head_other=h24b,
                          Telephone_other_member=h25,relationship=h26b,comments=h28)
head(hous3)
## # A tibble: 5 x 22
## # Groups:   cluster_number [2]
##   cluster_number Structure_number serial_num_HU House_number HU_occupied
##            <int>            <dbl>         <int> <chr>              <int>
## 1         999999                1             1 a1                     1
## 2         999999                1             2 a2                     1
## 3         999999                2             1 a2                     1
## 4         999998                1             1 a2                     1
## 5         999998                2             1 a2                     3
## # ... with 17 more variables: reason_unoccupied <lgl>, Name_HH_Head <chr>,
## #   sex_head <int>, occupation <chr>, Total_people <int>, below_4_male <int>,
## #   below_4_female <int>, Five_seventeen_male <int>,
## #   Five_seventeen_female <int>, eighteen_Above_male <int>,
## #   eighteen_Above_female <int>, operate_business <int>, Telephone_head <chr>,
## #   Telephone_head_other <chr>, Telephone_other_member <chr>,
## #   relationship <int>, comments <chr>
write.table(hous3, file="Rladies_Final_data/samplers/household.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')

Next steps is to analyse the cartographers data

Process the features and their gps.

Importance of feautures; Help in identifying the cluster easily during a survey. e.g in Northern counties, a water tank can easily OR determining the cluster boundary, e.g a road or a river

####for catographers cleaning points
#for feautures
r4 <- read.delim("Rladies_Raw_data/rawDataServer/r4.tab")
#head(r4)
#attach(r4)
#View(r4)
r41<-r4 %>% select(r4__id,ft,Clu_feat,sp,name_feature,gps_f__Latitude, gps_f__Longitude,gps_f__Accuracy,gps_f__Timestamp)
head(r41)
##   r4__id ft Clu_feat sp name_feature gps_f__Latitude gps_f__Longitude
## 1      1  1        2 NA river mwenda       -1.303448         36.81061
## 2      2  2        6 NA nairobi rail       -1.303512         36.81065
## 3      1  1        9 NA  boni forest       -1.303357         36.81041
##   gps_f__Accuracy    gps_f__Timestamp
## 1           5.100 2021-04-21T11:21:13
## 2           3.600 2021-04-21T11:22:19
## 3          19.717 2021-04-22T06:39:00

Next we exctract the geopoints which are captures in one cell on the stata, or the Tab output from the server. [one of the most interesting part in this presentation]

Refer to the segmentation and quick count process as had been described above

##codes here exctract the data to be send to catographers for verification mapping and segmenting
s1 <- read.delim("Rladies_Raw_data/rawDataServer/mapperHSMF.tab")
##households/structure during quick count
d3<-s1 %>% select(clu00,s11)
#View(d3)
hom_str<-d3 %>% transmute(cluster_number = clu00,latitude = str_extract_all(s11,'-\\d\\.\\d+'),longitude = str_extract_all(s11, '\\d{2}\\.\\d+')) %>% 
  unnest(cols = everything())
head(hom_str)
## # A tibble: 6 x 3
##   cluster_number latitude          longitude       
##            <int> <chr>             <chr>           
## 1         999999 -3.21234578965294 40.133440772716 
## 2         999999 -3.21479154251722 40.1335737423015
## 3         999999 -3.21644002320421 40.132856469716 
## 4         999999 -3.21511583401042 40.131692593445 
## 5         999999 -3.21627787768103 40.1308941201894
## 6         999999 -3.21385920389999 40.130393382724
write.table(hom_str, file="Rladies_Final_data/cartography/Quick_Count_Structure.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')
d4<-s1 %>% select(clu00,segselect,p3)
#d4
d4_1<-d4 %>% filter(!is.na(segselect))
#d4_1
seg_point<-d4_1 %>% transmute(cluster_number = clu00,num_seg=segselect, latitude = str_extract_all(p3,'-\\d\\.\\d+'),longitude = str_extract_all(p3, '\\d{2}\\.\\d+')) %>% 
  unnest(cols = everything())
head(seg_point)
## # A tibble: 6 x 4
##   cluster_number num_seg latitude          longitude       
##            <int>   <int> <chr>             <chr>           
## 1         999999       2 -3.21732748931697 40.126832186803 
## 2         999999       2 -3.21685917444566 40.1272103259819
## 3         999999       2 -3.21661750025944 40.1275055156961
## 4         999999       2 -3.21605752326633 40.1277948016159
## 5         999999       2 -3.21546217896797 40.1278302243816
## 6         999999       2 -3.21494935736305 40.127777090233
write.table(seg_point, file="Rladies_Final_data/cartography/SEG_points.tab", 
            na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')

Centroids are important in determining the central most part of the cluster

Refer to an interview with show location in the phone. This to tell the distance, terrain and cost of getting to the cluster.

Centroid segment 1

d6 <- s1 %>% select(clu00,d1)

seg_1 <- d6 %>% 
  transmute(cluster_number = clu00,
            latitude = str_extract_all(d1,'-\\d\\.\\d+'),
            longitude = str_extract_all(d1, '\\d{2}\\.\\d+')) %>% 
  unnest(cols = everything())

write.table(seg_1, file="Rladies_Final_data/cartography/segment_1_centroids.tab", 
            na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')

centroid segment 2

d7 <- s1 %>% 
  select(clu00,d2)

seg_2 <- d7 %>% 
  transmute(cluster_number = clu00,
            latitude = str_extract_all(d2,'-\\d\\.\\d+'),
            longitude = str_extract_all(d2, '\\d{2}\\.\\d+')) %>% 
  unnest(cols = everything())

write.table(seg_2, file="Rladies_Final_data/cartography/segment_2_centroids.tab", 
            na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')

centroid segment 3

d8 <- s1 %>% select(clu00,d3)

seg_3 <- d8 %>% transmute(cluster_number = clu00,
                          latitude = str_extract_all(d3,'-\\d\\.\\d+'),
                          longitude = str_extract_all(d3, '\\d{2}\\.\\d+')) %>% 
  unnest(cols = everything())

write.table(seg_3, file="Rladies_Final_data/cartography/segment_3_centroids.tab", 
            na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')

Steps to check the quality of the data

Comparing the mapper and the lister data

hm1 <- read.delim("Rladies_Raw_data/rawDataServer/hm1.tab")
m1<-hm1 %>% select(interview__id,hm1__id) %>% as.data.frame()
j1<-nassepvUpdate %>% select(interview__id,clu00,v1,v2)%>% as.data.frame()
n1<-m1 %>% left_join(j1, by="interview__id")
n2<-n1 %>% select(clu00,hm1__id,v1,v2) %>%
  rename(cluster_number=clu00, str_id=hm1__id, total_str=v1, tot_hous=v2)%>%as.data.frame()
kk1<-hom_str %>% group_by(cluster_number) %>% mutate(str_id = row_number())%>%as.data.frame()
#left joining when the MAPPER is super data

n3<-kk1 %>% left_join(n2, by=c("cluster_number", "str_id"))
n31<-n3 %>% select(cluster_number,str_id,total_str,tot_hous,longitude,latitude)%>%
  rename(cluster_number=cluster_number,str_id_MAPPER=str_id, total_str_LISTER=total_str,
         tot_hous_LISTER=tot_hous, longitude=longitude,latitude=latitude)

write.table(n31, file="Rladies_Final_data/Quality/aMAPPER.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')
#left joining when the LISTER is super data
n4<-n2 %>% left_join(kk1, by=c("cluster_number", "str_id"))
n41<-n4 %>% select(cluster_number,str_id,total_str,tot_hous,longitude,latitude)%>%
  rename(cluster_number=cluster_number,str_id_LISTER=str_id, total_str_LISTER=total_str,
         tot_hous_LISTER=tot_hous,  longitude=longitude,latitude=latitude)

write.table(n41, file="Rladies_Final_data/Quality/aLISTER.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')
#full join MAPPER and LISTER to get differences
d<-full_join(n2,kk1, by=c("cluster_number","str_id"))

write.table(d, file="Rladies_Final_data/Quality/bothLISTER_MAPPER.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')

Confirm that the struictures captured by the MAPPER during quick count compare with the LISTER

Points of households from the mapper

q1<-hom_str %>% group_by(cluster_number) %>% summarise(total_structure_MAPPER=n())%>% as.data.frame()
#Total listed by the LISTER during quick count
p1<-j1 %>% select(clu00, v1) %>% rename(cluster_number=clu00,total_structure_LISTER=v1)%>%as.data.frame()
c <- full_join(p1,q1, by="cluster_number")
write.table(c, file="Rladies_Final_data/Quality/LIST_MAP_AGGREGATE.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')

Cluster Listing Ends here

###—————————————————–cluster update starts here

Next we look at how to use R for Updating clusters

As mentioned earlier, updating has three levels of hierachy. Cluster, structure then household.

There is need for proper updating without loosing any household, or having any extras

This is with the aim of maintaining randomness during sampling and avoid bias.

We show capabilities of R in providing a flexible environment to work with other softwares through data manipulation

Household numbers are assigned by the samplers

Importance of preloads

Preload are important as you dont have to collect data afresh, but you only modify what you have

very good for longitudinal/panel data collection

Read the datasets

clu_main <- read.delim("Rladies_Raw_data/raw_data_for_preload/cluster_1.tab")

hh_main <- read.delim("Rladies_Raw_data/raw_data_for_preload/household_1.tab")

str_main <- read.delim("Rladies_Raw_data/raw_data_for_preload/structure_1.tab")

A household is defined as a person or group of people who, report to the same head, have same cooking arrangements, should live in a common place

A household number should be unique within the cluster

Householdsshould be between 50 and 149 households

Check Duplicated and missing households numbers in the datasets and remove if any
hh_main2 <- hh_main %>% 
  filter(!is.na(h07)) #filter any missing household number

hh_tot <- hh_main2 %>% 
  group_by(cluster_number) %>% 
  summarise(totHh=n()) %>% 
  arrange(totHh)
#check if there are repeated serialization withing households in the structure

hh_uniq <- hh_main2 %>% 
  group_by(cluster_number,Structure_number,serial_num_HU)%>% 
  filter(n()>1)##find any duplicates in the dataset after remove h07

hh_uniq #duplicated households
## # A tibble: 0 x 23
## # Groups:   cluster_number, Structure_number, serial_num_HU [0]
## # ... with 23 variables: cluster_number <int>, Structure_number <int>,
## #   serial_num_HU <int>, House_number <chr>, HU_occupied <int>, h07 <int>,
## #   reason_unoccupied <lgl>, Name_HH_Head <chr>, sex_head <int>,
## #   occupation <chr>, Total_people <int>, below_4_male <int>,
## #   below_4_female <int>, Five_seventeen_male <int>,
## #   Five_seventeen_female <int>, eighteen_Above_male <int>,
## #   eighteen_Above_female <int>, operate_business <int>, Telephone_head <chr>,
## #   Telephone_head_other <chr>, Telephone_other_member <chr>,
## #   relationship <int>, comments <chr>

Find any duplicated structure number if any in the cluster

Structures are unique

#Get total sum of structutes in a given cluster
str_tot <- str_main %>% 
  group_by(cluster_number)%>% 
  summarise(totStr=n()) %>% 
  arrange(totStr)

str_uniq<-str_main %>% 
  group_by(cluster_number,Structure_number)%>% 
  filter(n()>1)##find any duplicates in the structure

str_uniq##filter any duplicates structues 
## # A tibble: 0 x 13
## # Groups:   cluster_number, Structure_number [0]
## # ... with 13 variables: cluster_number <int>, Structure_number <int>,
## #   name_structure <chr>, gps_str_latitude <dbl>, gps_str_longitude <dbl>,
## #   gps_str_Accuracy <dbl>, gps_str_Altitude <dbl>, gps_str_Time <chr>,
## #   PointGPS <chr>, comments_str <chr>, str_residential <int>,
## #   comment_purpose <lgl>, Feauture_type <lgl>

Merging the structure and the household data to prepare the preload files

#joining the structures and the households fo coding
hh_str_TT <- str_tot %>% 
  left_join(hh_tot, by="cluster_number") 

#head(hh_str_TT)
##select the cluster number for the selected structures
##create a new str filtered depending on number of MAIN str in a cluster will be used for analysis

#str_sub <- str_tot %>% 
#  filter(totStr>50) #greater than 50 structures in the cluster
#head(str_sub)

#######--------------------------------clusters number selected for use
cluster_Numbers <- str_tot$cluster_number
###--------------------------------------------------------END CLUSTER NUMBERS

#--------------------------------------------------------start creating clusters
##
cluster <- clu_main %>% 
  filter(cluster_number %in% cluster_Numbers)

cluster$interview__id=cluster$cluster_number

str <- str_main %>% 
  filter(cluster_number %in% cluster_Numbers)

str$interview__id=str$cluster_number

hh <- hh_main2 %>% 
  filter(cluster_number %in% cluster_Numbers)

hh$interview__id=hh$cluster_number

##create the nassepv tab
#create the interview__id
###write the first nassep VI tab 
str_gr <- str %>% 
  select(cluster_number,gps_str_latitude,gps_str_longitude,gps_str_Altitude)

str_gr2 <- aggregate(str_gr, list(str_gr$cluster_number), FUN=head, 1)

str_gr3 <- str_gr2 %>% 
  select(cluster_number,gps_str_latitude,gps_str_longitude,gps_str_Altitude)

g1 <- str_gr3 %>% 
  rename(gps_clus__Latitude = gps_str_latitude, gps_clus__Longitude = gps_str_longitude, gps_clus__Altitude = gps_str_Altitude)%>%
  as.data.frame()
#g1
#head(g1)

cluster1 <- cluster %>%as.data.frame()

cluster1$clu00 <- as.integer(cluster1$cluster_number)

clu <- cluster1 %>% left_join(g1, by="cluster_number")

head(clu)
##   cluster_number  County Subcounty Division Location SubLocation    GEOCODE
## 1         999999 nairobi   ##N/A##  ##N/A##  ##N/A##     ##N/A## -999999999
## 2         999998  kiambu   ##N/A##  ##N/A##  ##N/A##     ##N/A## -999999999
##   EAName_1 EAtype_Residence   EAStatus frame_component clu_gps_lat clu_gps_long
## 1   herufi                2 -999999999      -999999999   -1.303507     36.81034
## 2  mucatha                1 -999999999      -999999999   -1.303354     36.81041
##   clu_gps_accuracy date_start_interview Total_Count_homesteads
## 1            4.900  2021-04-21T14:04:49                      2
## 2           16.413  2021-04-22T09:39:59                      2
##   Total_households_Homesteads cluster_req_seg households_Seg_1 Percent_seg_1
## 1                         300               1              100            33
## 2                         200               1              101            50
##   households_Seg_2 Pop_total_minus_pop_1 Percent_seg_2 hh_Segmet3 Percent_seg_3
## 1              120                   200            40         80            27
## 2               99                    99            50         NA    -999999999
##   CumPercenths_0 CumPerc_1 CumPerc_2   CumPer_3 Last_2_dig_cluster
## 1              0        33        73        100                 99
## 2              0        50       100 -999999999                 98
##   Cluster_Listed Why_not_listed distance_County_HQ_to_cluster dist_Km_Metres
## 1     -999999999             NA                    -999999999             NA
## 2     -999999999             NA                    -999999999             NA
##   Time_County_HQ_to_cluster time_hours_minutes appr_town_spend_night
## 1                -999999999                 NA               ##N/A##
## 2                -999999999                 NA               ##N/A##
##   Dist_townorfacility_cluster distance_in_KM_Metres clu_req_security
## 1                  -999999999                    NA       -999999999
## 2                  -999999999                    NA       -999999999
##   causes_insecurity_1 causes_insecurity_2 causes_insecurity_3
## 1                  NA                  NA                  NA
## 2                  NA                  NA                  NA
##   causes_insecurity_4 causes_insecurity_5 causes_insecurity_6
## 1                  NA                  NA                  NA
## 2                  NA                  NA                  NA
##   causes_insecurity_7 mode_transport terrain_cluster size_cluster
## 1                  NA     -999999999      -999999999   -999999999
## 2                  NA     -999999999      -999999999   -999999999
##   households_within_cluster current_settlement_cluster native_languages_1
## 1                -999999999                 -999999999            ##N/A##
## 2                -999999999                 -999999999            ##N/A##
##   Per_1_lang native_languages_2 Per_2_lang native_languages_3 Per_3_lang
## 1 -999999999                 NA         NA                 NA         NA
## 2 -999999999                 NA         NA                 NA         NA
##   main_econ_act     EAName name_EA_asknown name_chief telephone_chief
## 1       ##N/A## -999999999              NA    ##N/A##         ##N/A##
## 2       ##N/A## -999999999              NA    ##N/A##         ##N/A##
##   name_Assist_Chief telephone_Ast_chief Village_Elder telephone_village_elder
## 1           ##N/A##             ##N/A##       ##N/A##                 ##N/A##
## 2           ##N/A##             ##N/A##       ##N/A##                 ##N/A##
##              Date_End interview__id  clu00 gps_clus__Latitude
## 1 2021-04-21T14:16:12        999999 999999          -1.303349
## 2 2021-04-22T09:44:52        999998 999998          -1.303351
##   gps_clus__Longitude gps_clus__Altitude
## 1            36.81041             1739.3
## 2            36.81041             1733.7
write.table(clu, file="Rladies_Final_data/created_preload_files/nassepvUpdate.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')
attach(str)

str2 <- str %>% select(interview__id,Structure_number)

data_nas <- str2 %>% 
  group_by(interview__id) %>% 
  mutate(clusterpart__id = ifelse(Structure_number %in% 1:40,1,
                                  ifelse(Structure_number%in% 41:80,2,
                                         ifelse(Structure_number%in% 81:120,3,
                                                ifelse(Structure_number%in% 121:160,4,
                                                       ifelse(Structure_number%in% 161:200,5,6)))))) %>%
  ungroup() %>%
  group_by(interview__id,clusterpart__id) %>% 
  mutate(val=1:n()-1,nid = cur_group_id()) %>% ungroup() %>%
  select(-clusterpart__id) %>% mutate(interview__id=paste0(interview__id,'.',nid),val=paste0('Structure_number','__',val)) %>% select(-nid) %>%
  pivot_wider(names_from = c(val),values_from = Structure_number) %>%
  mutate(interview__id=gsub("\\..*","", interview__id)) %>% group_by(interview__id) %>%
  mutate(clusterpart__id=1:n()) #%>% select(order(colnames(.)))
head(data_nas)
## # A tibble: 2 x 4
## # Groups:   interview__id [2]
##   interview__id Structure_number__0 Structure_number__1 clusterpart__id
##   <chr>                       <int>               <int>           <int>
## 1 999999                          1                   2               1
## 2 999998                          1                   2               1
###write the cluster tab 
write.table(data_nas, file="Rladies_Final_data/created_preload_files/clusterpart.tab", na = "##NA##",
            row.names = F, col.names = T,   quote = F,   sep = '\t')
############################### Creating the structure id based on cluster__id and interview__id from cluster tab 
d3<-data_nas %>% 
  pivot_longer(cols = starts_with('Structure_number'), values_to = 'Structure_number') %>%    
  group_by(interview__id, clusterpart__id) %>% 
  mutate(structure__id = row_number())

d4 <- d3 %>% 
  filter(!is.na(Structure_number)) %>% 
  select(-name)

##manipulate to create the household tab

#check if there is any missing h07 household number and drop it
hh1 <- hh %>% drop_na(h07)  #drop missing household numbers

hh2 <- hh1 %>% group_by(cluster_number,h07) %>%filter(n() > 1)#no duplicated households in a cluster
hh3 <- hh2 %>% group_by(cluster_number,Structure_number, serial_num_HU) %>%filter(n() > 1)#no duplicated households in a structure
hh4 <- subset(hh3, below_4_male+below_4_female+Five_seventeen_male+Five_seventeen_female+eighteen_Above_male+eighteen_Above_female != Total_people) ##sum of members in different households age category sum to the total in the household
#Exctract the interview id and cluster part id from the created cluster data frame
d4$interview__id <- as.integer(d4$interview__id)

hh5 <- hh1 %>% left_join(d4, by=c("interview__id","Structure_number")) #%>% arrange(interview__id, clusterpart__id,s02)

hh6 <- hh5 %>% 
  group_by(interview__id,structure__id) %>% 
  mutate(household__id= row_number(structure__id))

hhf <- hh6 %>% 
  select(interview__id, clusterpart__id,    structure__id,  household__id,Structure_number, serial_num_HU,  h07)

hhff <- hhf %>% select(-Structure_number)

head(hhff)
## # A tibble: 5 x 6
## # Groups:   interview__id, structure__id [4]
##   interview__id clusterpart__id structure__id household__id serial_num_HU   h07
##           <int>           <int>         <int>         <int>         <int> <int>
## 1        999999               1             1             1             1     1
## 2        999999               1             1             2             2     2
## 3        999999               1             2             1             1     3
## 4        999998               1             1             1             1     1
## 5        999998               1             2             1             1     2
##############-----------------------------final household tab
write.table(hhff, file="Rladies_Final_data/created_preload_files/household.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')
##############-----------------------------final household tab
###########################------------------------------------creating structure sheet
############-------------------------------------------------------------------------------------------------str
hh55 <- hh5 %>% 
  select(interview__id, Structure_number) %>%
  group_by(interview__id,Structure_number)%>% 
  summarise() %>%
  as.data.frame()
## `summarise()` has grouped output by 'interview__id'. You can override using the `.groups` argument.
d44 <- d4 %>% 
  as.data.frame() %>% 
  select(interview__id, Structure_number) 

dz1 <- d44 %>%
  anti_join(hh55, by=c("interview__id", "Structure_number"))

dz2 <- dz1 %>%
  left_join(d4, by=c("interview__id","Structure_number"))

dz3 <- merge(hh5,dz2,by=c("interview__id","clusterpart__id","structure__id","Structure_number")) 

dz4 <- dplyr::bind_rows(dz2, hh5)

################--------------------------------------------------------------------------------------------str
hhg <- dz4 %>% 
  select(interview__id ,clusterpart__id,structure__id,Structure_number, serial_num_HU,h07) 

str2 <- hhg %>%
  mutate(serial_num_HU = serial_num_HU - 1) %>% 
  pivot_wider(names_from = serial_num_HU,values_from = h07,names_prefix = "h06b__")

##merge with some selected variables from str
str3 <- str %>% 
  select(interview__id,Structure_number,gps_str_latitude,gps_str_longitude,gps_str_Altitude)

final.str <- str2 %>% 
  left_join(str3, by=c("interview__id","Structure_number"))


str22 <- final.str  #  %>% 
 # select(-h06b__NA)

str23 <- str22 %>% 
#  rename(gps__Latitude=s04,gps__Longitude=s05,gps__Altitude=s06)%>%
  as.data.frame()

str24 <- str23[order(str23$interview__id,str23$clusterpart__id,str23$structure__id,str23$Structure_number),]#,clusterpart__id,structure__id,s02

head(str24)
##   interview__id clusterpart__id structure__id Structure_number h06b__0 h06b__1
## 3        999998               1             1                1       1      NA
## 4        999998               1             2                2       2      NA
## 1        999999               1             1                1       1       2
## 2        999999               1             2                2       3      NA
##   gps_str_latitude gps_str_longitude gps_str_Altitude
## 3        -1.303351          36.81041           1733.7
## 4        -1.303348          36.81041           1733.7
## 1        -1.303349          36.81041           1739.3
## 2        -1.303336          36.81040           1733.0
write.table(str24, file="Rladies_Final_data/created_preload_files/structure.tab", na = "",
            row.names = F, col.names = T,   quote = F,   sep = '\t')
########################-------------------------------------final structure tab

Next steps that may draw interest and call for collaborations

start development of flexi dashboards for realtime data collection monitoring

Next steps spatial distribution and download of the data at the bureau. Shinny app for this

Switch to QGIS for DEMO

Spatial Distribution of the population for the 2009 census using the shapefiles for Makueni District

kenya <- readOGR("Rladies_Raw_data/demoShp", "Makueni_District")
## Warning in OGRSpatialRef(dsn, layer, morphFromESRI = morphFromESRI, dumpSRS =
## dumpSRS, : Discarded ellps Clarke 1880 (RGS) in Proj4 definition: +proj=longlat
## +a=6378249.145 +rf=293.465 +no_defs
## Warning in OGRSpatialRef(dsn, layer, morphFromESRI = morphFromESRI, dumpSRS
## = dumpSRS, : Discarded datum Arc_1960 in Proj4 definition: +proj=longlat
## +a=6378249.145 +rf=293.465 +no_defs
## Warning in showSRID(wkt2, "PROJ"): Discarded ellps Clarke 1880 (RGS) in Proj4
## definition: +proj=longlat +a=6378249.145 +rf=293.465 +no_defs +type=crs
## Warning in showSRID(wkt2, "PROJ"): Discarded datum Arc 1960 in Proj4 definition
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\KELPTMGD1561\OneDrive - Norwegian Refugee Council\Scripts\R Ladies meetups\R-Ladies Nairobi github materials\R for official statistics\Rladies_Raw_data\demoShp", layer: "Makueni_District"
## with 54 features
## It has 30 fields
bins <- c(1, 500, 1000, 2500, 5000, 10000, 100000, 1000000, 3000000) #here we set the binning manually and store it as a vector
pal <- colorBin("RdYlBu", domain = kenya$SUM_Househ, bins = bins)  #leaflet function to map data values to a color pallete. Colorbin returns a function, not a value.

labels <- paste(kenya$SLNAME, round(kenya$SUM_Househ, digits = 1))%>%
  lapply(htmltools::HTML)


m <- leaflet() %>%
  leaflet::addTiles() %>%  # Add default OpenStreetMap map tiles
  leaflet::addPolygons(data = kenya,
                       weight = 1, #polygon line weight
                       smoothFactor = 1, #smoothFactor addition smooths out the shapefile polylines to make it load faster. Values < 1 adds detail while values > 1 take detail away.
                       color = "white", #line color
                       fillOpacity = 0.8, #polygon opacity
                       fillColor = pal(kenya$SUM_Househ), #polygon color
                       label = labels
  )  %>%
 
 
  leaflet::addLegend(pal = pal, #adds legend
                     values = kenya$SUM_Househ,
                     opacity = 0.7, #legend opacity
                     position = "topright")  #legend position
m

Spatial Mapping of some Random points to show coverage

kenya.polys <- st_read("migori/Migori_District.shp")
## Reading layer `Migori_District' from data source `C:\Users\KELPTMGD1561\OneDrive - Norwegian Refugee Council\Scripts\R Ladies meetups\R-Ladies Nairobi github materials\R for official statistics\migori\Migori_District.shp' using driver `ESRI Shapefile'
## Simple feature collection with 164 features and 30 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 33.94061 ymin: -1.386303 xmax: 34.73015 ymax: -0.6478643
## geographic CRS: Arc 1960
kenya.points <- read.delim("Rladies_Raw_data/rawDataServer/randomPointsMigori.tab", header = TRUE, stringsAsFactors = FALSE, quote = "", sep = "\t")

kenya.points <- st_as_sf(kenya.points, coords = c("longitude", "latitude"), crs = 4326)

leaflet() %>%
  addProviderTiles(providers$Esri.WorldImagery,
                   options = providerTileOptions(noWrap = TRUE)) %>%
  addCircles(data = kenya.points, color = "red") %>%
  addPolygons(data = kenya.polys,
              weight = 1)
## Warning: sf layer has inconsistent datum (+proj=longlat +a=6378249.145 +rf=293.465 +no_defs).
## Need '+proj=longlat +datum=WGS84'

END

Codes and the dataset available freely at my github page https://github.com/samwenda

If you are writting a paper and you found our work useful, please acknowledge source to allow other researchers to benefit